Linear Models for Regression


In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [ ]:
from sklearn.linear_model import Ridge, LinearRegression

In [ ]:
from sklearn.model_selection import cross_val_score

In [ ]:
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this cell requires scikit-learn < 1.2.
from sklearn.datasets import load_boston
boston = load_boston()

In [ ]:
X, y = boston.data, boston.target

In [ ]:
X.shape

In [ ]:
fig, axes = plt.subplots(3, 5, figsize=(20, 10))
for i, ax in enumerate(axes.ravel()):
    if i >= X.shape[1]:  # hide the two axes beyond the 13 features
        ax.set_visible(False)
        continue
    ax.plot(X[:, i], y, 'o', alpha=.5)
    ax.set_title("{}: {}".format(i, boston.feature_names[i]))
    ax.set_ylabel("MEDV")

In [ ]:
print(X.shape)
print(y.shape)

In [ ]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)

In [ ]:
np.mean(cross_val_score(LinearRegression(),
                        X_train, y_train, cv=10))

In [ ]:
np.mean(cross_val_score(Ridge(),
                        X_train, y_train, cv=10))

In [ ]:
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': np.logspace(-3, 3, 14)}
print(param_grid)

In [ ]:
grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)
grid.fit(X_train, y_train)

In [ ]:
import pandas as pd
plt.figure(dpi=200)
results = pd.DataFrame(grid.cv_results_)
results.plot('param_alpha', 'mean_train_score', ax=plt.gca())
results.plot('param_alpha', 'mean_test_score', ax=plt.gca())

plt.legend()
plt.xscale("log")

In [ ]:
from sklearn.preprocessing import PolynomialFeatures, scale
# Note: scaling the full dataset before splitting leaks test-set statistics
# into the features (a leak-free pipeline sketch follows this cell)
X_poly = PolynomialFeatures(include_bias=False).fit_transform(scale(X))
print(X_poly.shape)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=42)
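
For contrast, here is a minimal leak-free sketch using a Pipeline, so that scaling and polynomial expansion are re-fit on each cross-validation training fold. The variable names (X_tr_raw etc.) are new, chosen so the polynomial X_train / X_test defined above are not clobbered:

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Re-split the raw features under new names.
X_tr_raw, X_te_raw, y_tr_raw, y_te_raw = train_test_split(X, y, random_state=42)

# Scaling and polynomial expansion live inside the pipeline, so both are
# re-fit on each CV training fold; no test-fold statistics leak in.
pipe = make_pipeline(StandardScaler(),
                     PolynomialFeatures(include_bias=False),
                     Ridge())
grid_pipe = GridSearchCV(pipe, {'ridge__alpha': np.logspace(-3, 3, 14)}, cv=10)
grid_pipe.fit(X_tr_raw, y_tr_raw)
print(grid_pipe.best_params_, grid_pipe.best_score_)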

In [ ]:
np.mean(cross_val_score(LinearRegression(),
                        X_train, y_train, cv=10))

In [ ]:
np.mean(cross_val_score(Ridge(),
                        X_train, y_train, cv=10))

In [ ]:
grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)
grid.fit(X_train, y_train)

In [ ]:
results = pd.DataFrame(grid.cv_results_)

results.plot('param_alpha', 'mean_train_score', ax=plt.gca())
results.plot('param_alpha', 'mean_test_score', ax=plt.gca())
plt.legend()
plt.xscale("log")

In [ ]:
print(grid.best_params_)
print(grid.best_score_)

In [ ]:
lr = LinearRegression().fit(X_train, y_train)
plt.scatter(range(X_poly.shape[1]), lr.coef_, c=np.sign(lr.coef_), cmap="bwr_r")

In [ ]:
ridge = grid.best_estimator_
plt.scatter(range(X_poly.shape[1]), ridge.coef_, c=np.sign(ridge.coef_), cmap="bwr_r")

In [ ]:
ridge100 = Ridge(alpha=100).fit(X_train, y_train)
ridge1 = Ridge(alpha=1).fit(X_train, y_train)
plt.figure(figsize=(8, 4))

plt.plot(ridge1.coef_, 'o', label="alpha=1")
plt.plot(ridge.coef_, 'o', label="alpha={:.0f} (best)".format(ridge.alpha))
plt.plot(ridge100.coef_, 'o', label="alpha=100")
plt.legend()

In [ ]:
from sklearn.linear_model import Lasso

lasso = Lasso().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))

Exercise

Load the diabetes dataset using sklearn.datasets.load_diabetes. Apply LinearRegression, Ridge, and Lasso, and visualize the coefficients. Then try adding polynomial features.


In [ ]:
# %load solutions/linear_models_diabetes.py
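
If the solution file is not at hand, here is one possible sketch, not the course's reference solution (the names X_d, y_d etc. are new):

In [ ]:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
X_d, y_d = diabetes.data, diabetes.target
X_d_train, X_d_test, y_d_train, y_d_test = train_test_split(
    X_d, y_d, random_state=42)

# Fit the three models on the raw features and overlay their coefficients.
plt.figure(figsize=(8, 4))
for name, model in [('LinearRegression', LinearRegression()),
                    ('Ridge', Ridge()),
                    ('Lasso', Lasso())]:
    model.fit(X_d_train, y_d_train)
    print("{}: test R^2 = {:.2f}".format(name, model.score(X_d_test, y_d_test)))
    plt.plot(model.coef_, 'o', label=name)
plt.xticks(range(X_d.shape[1]), diabetes.feature_names, rotation=45)
plt.legend()

# Polynomial features can be added the same way as above, e.g.
# PolynomialFeatures(include_bias=False).fit_transform(X_d).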